Load Packages

source("https://bioconductor.org/biocLite.R")
biocLite("AnnotationHub")
library(AnnotationHub)
library(biomaRt)
library(tidyverse) # ggplot2, dplyr, tidyr, readr, purrr, tibble
library(stringr) # string manipulation
library(data.table)

Set directory paths

proj_dir <- "/mnt/hd2/Dropbox (Partners HealthCare)/projects/2017_06_haystack/get_data_script"

#proj_dir <- "/data/pinello/PROJECTS/2017_06_HAYSTACK/haystack_bio/get_data_script"
sample_names_dir <-  file.path(proj_dir,  "sample_names")
#dir.create(sample_names_dir)
output_dir <- file.path(proj_dir,"files_source_urls")

data_dir <- "/data/molpath/common_data/roadmap"
bigwig_dir <- file.path(data_dir, "bigwig") 
rpkm_output_dir <- file.path(data_dir, "gene_expression" ,"rpkm")


#dir.create(rpkm_output_dir)

Download Gene expression metadata

download.file("http://egg2.wustl.edu/roadmap/data/byDataType/rna/expression/EG.name.txt",
              "EG.name.txt")

Read data

eg_names <- fread(input = "EG.name.txt", 
                    header = FALSE, 
                    col.names = c("epigenome", 
                                  "celltypes")) %>% 
  filter(epigenome != "E000" )

eg_names 

Get experiments metada from RoadMap Epigenomics

ah <- AnnotationHub()

# Searching for  epigenomes in the roadmap database

epiFiles <- query(ah, c("EpigenomeRoadMap", "^E"))
epiFiles <- query(epiFiles , c( "H3K27ac.fc.signal.bigwig",
                                "H3K27me3.fc.signal.bigwig",
                                "H3K4me3.fc.signal.bigwig",
                                "DNase.fc.signal.bigwig",
                                "WGBS_FractionalMethylation.bigwig"), 
                  pattern.op= `|`)

epiFiles
## AnnotationHub with 442 records
## # snapshotDate(): 2017-04-25 
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: BigWigFile
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## #   tags, rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH32003"]]' 
## 
##             title                                 
##   AH32003 | E001-H3K4me3.fc.signal.bigwig         
##   AH32006 | E001-H3K27me3.fc.signal.bigwig        
##   AH32009 | E002-H3K4me3.fc.signal.bigwig         
##   AH32012 | E002-H3K27me3.fc.signal.bigwig        
##   AH32014 | E003-DNase.fc.signal.bigwig           
##   ...       ...                                   
##   AH49524 | E105_WGBS_FractionalMethylation.bigwig
##   AH49525 | E106_WGBS_FractionalMethylation.bigwig
##   AH49526 | E109_WGBS_FractionalMethylation.bigwig
##   AH49527 | E112_WGBS_FractionalMethylation.bigwig
##   AH49528 | E113_WGBS_FractionalMethylation.bigwig

Create epigenome-by-mark array

RMEP_dt <- data.table(title= epiFiles$title,
                          sourceurl= epiFiles$sourceurl) %>%
    bind_cols(do(., data.frame(
      str_split(.$title,
                "-|.fc|_WGBS_",
                simplify = TRUE)[,1:2],
      stringsAsFactors = FALSE)))  %>%
  select(-title) %>%
  rename( epigenome= "X1",
          mark= "X2") %>%
     mutate(mark=replace(mark, 
                         mark== "FractionalMethylation.bigwig", 
                         "methylation"))

marks <- RMEP_dt %>% 
  select(mark) %>% 
  unique  %>% 
  unlist

RMEP_dt
epifiles_dt<- RMEP_dt %>% 
  dcast( epigenome ~ mark )
## Using 'mark' as value column. Use 'value.var' to override
epifiles_dt

Filter Epigenomes

Epigenomes that have gene expression data

epifiles_dt  <- left_join(eg_names,
                          epifiles_dt,
                      by='epigenome')

epifiles_dt

Number of epigenomes with gene expression data that have 1, 2, 3, or 4 of the marks (DNase, methylation, H3K27ac, H3K27me3 )

epifiles_dt$marks_num <- epifiles_dt %>%
     select(-epigenome, -celltypes)  %>%
     is.na %>% 
    `!` %>% 
    rowSums 

epifiles_dt

Frequency counts of marks

epifiles_dt %>% 
  select(marks_num) %>% 
  table
## .
##  2  3  4  5 
##  1 13 29 13

Epigenomes with all 5 marks

epifiles_dt %>%
  filter(marks_num==5)

Epigenomes with 4 or 5 marks

selected_epigenomes <- epifiles_dt %>%
  filter(marks_num >=4) %>%
  arrange(marks_num) %>%
  filter(epigenome!="E056")

selected_epigenomes 

Save source urls to file

for (mark_target in marks){
  RMEP_dt  %>% 
  filter(epigenome %in% selected_epigenomes$epigenome) %>% 
  filter(mark == mark_target )  %>% 
  select(sourceurl) %>% 
   fwrite(file= file.path(output_dir, paste0(mark_target,"_urls.txt")),
          sep='\t',
          col.names = FALSE)
}

Preprocess RPKM matrix

ah <- AnnotationHub()
## snapshotDate(): 2017-04-25
RPKM_query <- query(ah, c("EpigenomeRoadMap", "exon.RPKM.pc"))
RPKM_query
## AnnotationHub with 1 record
## # snapshotDate(): 2017-04-25 
## # names(): AH49026
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: data.frame
## # $rdatadateadded: 2015-08-10
## # $title: 57epigenomes.exon.RPKM.pc.gz
## # $description: RPKM expression matrix for protein coding exons
## # $taxonomyid: 9606
## # $genome: hg19
## # $sourcetype: Zip
## # $sourceurl: http://egg2.wustl.edu/roadmap/data/byDataType/rna/express...
## # $sourcesize: 32882572
## # $tags: c("EpigenomeRoadMap", "expression", "quantification",
## #   "hg19") 
## # retrieve record with 'object[["AH49026"]]'
RPKM_dt <- ah[["AH49026"]]
## loading from cache '/home/rick//.AnnotationHub/55332'
## require("SummarizedExperiment")
RPKM_dt
## class: RangedSummarizedExperiment 
## dim: 233480 56 
## metadata(0):
## assays(1): ''
## rownames(233480): chr10:100003848-100004654<1
##   chr10:100007447-100008748<-1 ... chrY:9384257-9384264<-1
##   chrY:9384401-9384693<-1
## rowData names(2): gene_id E000
## colnames(56): E003 E004 ... E127 E128
## colData names(0):
rpkm_genes <- data.frame(gene_id = 
                           rowData(RPKM_dt)$gene_id %>% 
                           as.character,
                         stringsAsFactors = FALSE) %>% 
  cbind( . , 
        RPKM_dt %>% 
        assay() %>% 
        data.frame()
  )
rpkm_genes

Query biomart

Define biomart object

mart <- useMart(biomart = "ensembl", 
                dataset = "hsapiens_gene_ensembl")

#Get gene names
results <- getBM(attributes = c("ensembl_gene_id", 
                                "external_gene_name"),
                 filters = "ensembl_gene_id", 
                 values = rpkm_genes$gene_id,
                 mart = mart) 
results 

rename

results <- results  %>%
  dplyr::rename(gene_id ="ensembl_gene_id")

results 

Add gene names to rpkm data

rpkm_genes <- full_join(rpkm_genes, 
                        results, 
                        by = "gene_id")

rpkm_genes

Compute average of transcipt expression rpkm over genes

rpkm_genes_avg <- rpkm_genes %>%
  select(-gene_id) %>% 
  group_by(external_gene_name) %>% 
  summarise_all(funs(mean))

rpkm_epigenomes <- colnames(rpkm_genes_avg)[-1]

rpkm_genes_avg

Save rpkm file for each epigenome

for (epigenome in rpkm_epigenomes){
  rpkm_genes_avg  %>%
  select(external_gene_name,
         epigenome ) %>% 
  fwrite(file= file.path(rpkm_output_dir,
                         paste0(epigenome, 
                                "_avg_rpkm.txt")),
         sep='\t',
         col.names = FALSE)
}

Create haystack sample name files for all marks

prepare gene expression metadata

rpkm_epigenome_dt <- data.table(epigenome = rpkm_epigenomes,
                                input_name = paste(rpkm_epigenomes, 
                                                  substring(eg_names$celltypes, 1,7), 
                                                  sep='_') , 
                                rpkm_filepath = file.path(rpkm_output_dir,
                                                       paste0(rpkm_epigenomes, 
                                                              "_avg_rpkm.txt")
                                                       )
                                )
rpkm_epigenome_dt

Save sample name metadata to file

for (mark_target in marks){
  sample_names_dt <- RMEP_dt  %>% 
    filter(epigenome %in% selected_epigenomes$epigenome) %>% 
    filter(mark == mark_target )  %>% 
    select(sourceurl, 
           epigenome)  %>% 
    rowwise() %>%
    mutate(sourceurl = basename(sourceurl), 
           filepath =  file.path(bigwig_dir, 
                                mark_target,
                                sourceurl)
           ) %>%
    left_join(. , 
              rpkm_epigenome_dt, 
              by = "epigenome") %>%
    select(input_name,
           filepath,
           rpkm_filepath)
 
   fwrite(
    sample_names_dt,
    file.path(sample_names_dir, 
              paste0("sample_names_",
                     mark_target,
                     ".txt")),
    col.names = FALSE,
    sep = "\t"
  )
}
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.3 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] SummarizedExperiment_1.6.4 DelayedArray_0.2.7        
##  [3] matrixStats_0.52.2         Biobase_2.36.2            
##  [5] GenomicRanges_1.28.5       GenomeInfoDb_1.12.2       
##  [7] IRanges_2.10.3             S4Vectors_0.14.4          
##  [9] bindrcpp_0.2               data.table_1.10.4         
## [11] stringr_1.2.0              dplyr_0.7.3               
## [13] purrr_0.2.3                readr_1.1.1               
## [15] tidyr_0.7.1                tibble_1.3.4              
## [17] ggplot2_2.2.1              tidyverse_1.1.1           
## [19] biomaRt_2.32.1             AnnotationHub_2.8.2       
## [21] BiocGenerics_0.22.0       
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.3.1                    bit64_0.9-7                  
##  [3] jsonlite_1.5                  modelr_0.1.1                 
##  [5] shiny_1.0.5                   assertthat_0.2.0             
##  [7] interactiveDisplayBase_1.14.0 blob_1.1.0                   
##  [9] GenomeInfoDbData_0.99.0       cellranger_1.1.0             
## [11] yaml_2.1.14                   RSQLite_2.0                  
## [13] backports_1.1.0               lattice_0.20-35              
## [15] glue_1.1.1                    digest_0.6.12                
## [17] XVector_0.16.0                rvest_0.3.2                  
## [19] colorspace_1.3-2              Matrix_1.2-11                
## [21] htmltools_0.3.6               httpuv_1.3.5                 
## [23] plyr_1.8.4                    psych_1.7.8                  
## [25] XML_3.98-1.9                  pkgconfig_2.0.1              
## [27] broom_0.4.2                   haven_1.1.0                  
## [29] zlibbioc_1.22.0               xtable_1.8-2                 
## [31] scales_0.5.0                  lazyeval_0.2.0               
## [33] mnormt_1.5-5                  magrittr_1.5                 
## [35] readxl_1.0.0                  mime_0.5                     
## [37] memoise_1.1.0                 evaluate_0.10.1              
## [39] nlme_3.1-131                  forcats_0.2.0                
## [41] xml2_1.1.1                    foreign_0.8-69               
## [43] BiocInstaller_1.26.1          tools_3.4.1                  
## [45] hms_0.3                       munsell_0.4.3                
## [47] AnnotationDbi_1.38.2          compiler_3.4.1               
## [49] rlang_0.1.2                   grid_3.4.1                   
## [51] RCurl_1.95-4.8                bitops_1.0-6                 
## [53] rmarkdown_1.6                 gtable_0.2.0                 
## [55] DBI_0.7                       curl_2.8.1                   
## [57] reshape2_1.4.2                R6_2.2.2                     
## [59] lubridate_1.6.0               knitr_1.17                   
## [61] bit_1.1-12                    bindr_0.1                    
## [63] rprojroot_1.2                 stringi_1.1.5                
## [65] Rcpp_0.12.12